This notebook contains analysis of the NHANES 2017–2020 study.
Each section has graphs and tables to attempt to find insightful conclusions
Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import openpyxl
import xport
import plotly.graph_objects as go
import plotly.express as px
from functools import reduce
import matplotlib.pyplot as plt
import scipy as sc
from IPython.display import display, HTML
Functions to help with analysis and graphs
def remove_outlier(df_in, col_name, k=1.5):
    """Return the rows of ``df_in`` whose ``col_name`` value lies inside Tukey's fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1/Q3 are the
    25th/75th percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df_in: DataFrame to filter.
        col_name: Name of the numeric column used to detect outliers.
        k: Fence multiplier applied to the IQR (1.5 is the conventional value).

    Returns:
        The subset of ``df_in`` (original index preserved) whose value is
        strictly between the two fences. Because the comparison is strict,
        rows sitting exactly on a fence and rows where the column is NaN are
        dropped as well.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    fence_low = q1 - k * iqr
    fence_high = q3 + k * iqr
    inside_fences = (values > fence_low) & (values < fence_high)
    return df_in.loc[inside_fences]
def side_by_side(*dfs):
    """Render the given DataFrames next to each other in the notebook output.

    Each frame is converted with ``to_html()`` and wrapped in its own ``<div>``
    inside a flex container, then shown through IPython's ``display``/``HTML``.
    """
    cells = ''.join(
        '<div style="margin-right: 2em">' + frame.to_html() + '</div>'
        for frame in dfs
    )
    display(HTML('<div style="display:flex">' + cells + '</div>'))
Load in each dataset from NHANES
from pathlib import Path

# Build paths with pathlib instead of backslash string literals: sequences such
# as "\q" or "\e" are invalid escapes in a normal string, and a hard-coded
# backslash separator only resolves on Windows.
_DATA_DIR = Path('DATA')
_QUESTIONNAIRE_DIR = _DATA_DIR / 'questionaire'  # spelling kept as in the existing directory name
_EXAMINATION_DIR = _DATA_DIR / 'examination'


def _load_xpt(path):
    """Read one SAS transport (.xpt) file into a DataFrame via ``xport``."""
    with open(path, 'rb') as f:
        return xport.to_dataframe(f)


# Demographics file is read from the working directory.
df_demog = _load_xpt('DEMO.xpt')

# Questionnaire and examination files, loaded in the same order as before.
df_diet_plan_nutrition = _load_xpt(_QUESTIONNAIRE_DIR / 'diet_behavior_nutrition.XPT')
df_smoking = _load_xpt(_QUESTIONNAIRE_DIR / 'smoking_QUES.xpt')
df_health_insurance = _load_xpt(_QUESTIONNAIRE_DIR / 'health_insurance.XPT')
df_oral_health = _load_xpt(_QUESTIONNAIRE_DIR / 'oral_health.XPT')
df_physical_activity = _load_xpt(_QUESTIONNAIRE_DIR / 'physical_activity.XPT')
df_occupation = _load_xpt(_QUESTIONNAIRE_DIR / 'occupation.XPT')
df_alcohol = _load_xpt(_QUESTIONNAIRE_DIR / 'Alcohol.XPT')
df_blood_cholest = _load_xpt(_QUESTIONNAIRE_DIR / 'Blood_pres_chol.XPT')
df_oral_exam = _load_xpt(_EXAMINATION_DIR / 'oral_exam.XPT')
df_medical = _load_xpt(_QUESTIONNAIRE_DIR / 'medical.XPT')
df_body_measure = _load_xpt(_EXAMINATION_DIR / '_body_measure.XPT')
df_diabetes = _load_xpt(_EXAMINATION_DIR / 'diabetes.XPT')
df_liver = _load_xpt(_EXAMINATION_DIR / 'liver.XPT')
Creating a column decoder dictionary which will be used to change each column name to a more readable format
# NHANES variable codes grouped by source dataset; each maps to the readable
# column name used in the rest of the notebook.
_decoder_sections = {
    'demographic': {
        'SEQN': 'ID',
        'RIAGENDR': 'gender',
        'RIDAGEYR': 'age_years',
        'RIDEXPRG': 'preg_status',
        'DMDEDUC2': 'education_level',
        'INDFMPIR': 'family_income',
    },
    # https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_SMQ.htm
    'smoking': {
        'SMQ040': 'sm_freq',
        'SMD030': 'sm_age_started',
        'SMD650': 'sm_amount_per_day',
    },
    # https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_PAQ.htm
    'physical activity': {
        'PAD680': 'mins_sedentary',
        'PAD675': 'mins_mod_act',
        'PAD660': 'mins_vig_act',
        'PAD630': 'mins_mod_work',
        'PAD615': 'mins_vig_work',
        'PAQ605': 'vig_work',
        'PAQ620': 'mod_work',
        'PAQ650': 'vig_rec',
        'PAQ665': 'mod_rec',
    },
    # https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_OCQ.htm
    'occupation': {
        'OCQ180': 'total_hours',
    },
    # https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_BPQ.htm
    'blood pressure & cholesterol': {
        'BPQ050A': 'medication_hbp',
        'BPQ100D': 'medication_hchol',
    },
    # https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_ALQ.htm
    'alcohol': {
        'ALQ130': '1yr_avg_per_day',
        'ALQ121': '1yr_freq',
    },
    # https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_OHXREF.htm
    'oral exam': {
        'OHAROCDT': 'decayed_teeth',
        'OHAROCGP': 'gum_problems',
    },
    # https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_LUX.htm#LUAXSTAT
    'liver': {
        'LUXSMED': 'liver_stiffness',
    },
    # Diabetes questionnaire - https://wwwn.cdc.gov/Nchs/Nhanes/2017-2018/P_DIQ.htm
    'diabetes': {
        'DIQ010': 'diabetes',
        'DID040': 'diabetes_age',
    },
    'body measure': {
        'BMXWT': 'weight_kgs',
        'BMXBMI': 'bmi',
        'BMXHT': 'height_cm',
    },
}

# Flatten the sections into one code -> readable-name mapping (insertion order kept).
all_column_decoder = {
    code: readable
    for section in _decoder_sections.values()
    for code, readable in section.items()
}
Merge datasets. 'SEQN' will be used as the join key, as 'SEQN' is a person's ID and is in every dataset
# All loaded NHANES tables, in the order they are joined.
df_to_merge = [
    df_demog, df_smoking, df_blood_cholest, df_alcohol, df_occupation,
    df_physical_activity, df_oral_health, df_health_insurance,
    df_diet_plan_nutrition, df_oral_exam, df_medical,
    df_body_measure, df_liver, df_diabetes,
]

# Fold the tables together left-to-right with an outer join on the respondent ID,
# so a respondent missing from one table is still kept.
df_merge = df_to_merge[0]
for next_table in df_to_merge[1:]:
    df_merge = pd.merge(df_merge, next_table, on=['SEQN'], how='outer')
Rename columns
# Keep only the decoded NHANES variables and give them their readable names.
columns_to_keep = [code for code in all_column_decoder]
df_all = df_merge[columns_to_keep].rename(columns=all_column_decoder)
Rename row values names to human readable format.
# Recode numeric NHANES answer codes into readable labels and turn the
# "refused" / "don't know" style codes into NaN.
#
# Every result is assigned back to its column. Calling
# ``df_all[col].replace(..., inplace=True)`` mutates a temporary column
# selection (chained assignment): pandas 2.x warns about it and under
# Copy-on-Write it leaves ``df_all`` unchanged.
df_all["gender"] = df_all["gender"].replace([1, 2], ['Male', 'Female'])
df_all["sm_freq"] = df_all["sm_freq"].replace([1, 2, 3], ["Daily", "Some Days", "Never"])
df_all["sm_amount_per_day"] = df_all["sm_amount_per_day"].replace([777, 999], np.nan)
df_all["decayed_teeth"] = df_all["decayed_teeth"].replace([1, 2], ['Yes', 'No'])
df_all["gum_problems"] = df_all["gum_problems"].replace([1, 2], ['Yes', 'No'])

# NOTE(review): the labels for codes 4-6 look shifted relative to the ALQ121
# codebook (which appears to list 4 = 2 times a week, 5 = once a week,
# 6 = 2-3 times a month) — confirm before relying on them. Codes 7-10 and the
# 77/99 codes are all mapped to NaN.
df_all["1yr_freq"] = df_all["1yr_freq"].replace(
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 77, 99],
    ['None', 'Everyday', 'Most Days', '3-4 Times a week', '1 per week', '2-3 Times a month',
     '1 a month', np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
df_all["1yr_avg_per_day"] = df_all["1yr_avg_per_day"].replace([777, 999], np.nan)

# Frame-wide pass: 7777 / 9999 are treated as missing in every column.
df_all = df_all.replace([7777, 9999], np.nan)

df_all["medication_hbp"] = df_all["medication_hbp"].replace(
    [1, 2, 7, 9], ['Yes', 'No', np.nan, np.nan])
df_all["medication_hchol"] = df_all["medication_hchol"].replace(
    [1, 2, 7, 9], ['Yes', 'No', np.nan, np.nan])

# NOTE(review): code 1 presumably means "less than 9th grade", so the '>'
# in its label looks inverted — confirm against the DMDEDUC2 codebook.
df_all["education_level"] = df_all["education_level"].replace(
    [1, 2, 3, 4, 5, 7, 9],
    ['>9th Grade', 'Year 9-11', 'High School Grad', 'College Level',
     'Above College', np.nan, np.nan])

df_all['total_hours'] = df_all['total_hours'].replace([77777, 99999], np.nan)
df_all["diabetes"] = df_all["diabetes"].replace(
    [1, 2, 3, 7, 9], ['Yes', 'No', np.nan, np.nan, np.nan])

# Working copy used by every analysis section below.
df = df_all.copy()
Create bins for age
# Decade age groups. ``right=False`` makes every bin [lower, upper), so an age
# on a decade boundary (30, 40, ...) lands in the decade it starts rather than
# the one before it. The last edge is 81 so that age 80 stays in the final
# group; ages below 20 get NaN.
# NOTE(review): this labels age 80 as "70's" — confirm that is the intended grouping.
df['age'] = pd.cut(df['age_years'], bins=[20, 30, 40, 50, 60, 70, 81],
                   labels=["20's", "30's", "40's", "50's", "60's", "70's"],
                   right=False)

# Two-way split: ages 1-50 inclusive are 'Below 50', ages 51-90 are 'Above 50'.
df['abv_bel_50'] = pd.cut(df['age_years'], bins=[1, 50, 90],
                          labels=['Below 50', 'Above 50'], include_lowest=True)
This section looks at the increased frequency of oral problems seen in those who smoke cigarettes
Hypothesis
It is hypothesized that smokers will have higher occurrences of oral problems like decayed teeth or gum related issues. Smoking causes problems throughout the human body, especially the mouth since smoke is inhaled through it, so it is likely that smokers have teeth and gum related issues.
# Respondents with a recorded smoking frequency only.
df13 = df.dropna(subset=['sm_freq'])

# Anyone whose frequency is not "Never" counts as a smoker.
df13['sm_status'] = np.where(df13['sm_freq'] != "Never", "Smokes", "Doesn't Smoke")

# An oral problem is decayed teeth and/or gum problems; missing answers count as "No".
has_oral_problem = (df13['decayed_teeth'] == "Yes") | (df13['gum_problems'] == "Yes")
df13['oral_problems'] = np.where(has_oral_problem, "Yes", "No")

# Line chart: number of smokers / non-smokers per age group.
status_by_age = (
    df13.groupby(["sm_status", "age"])
    .size()
    .rename("Counts")
    .reset_index()
)
fig = px.line(
    status_by_age,
    x="age",
    y="Counts",
    color='sm_status',
    title='Smoking status VS Age',
    width=500,
    height=400,
)
fig.show()

# Overlaid age histograms, one facet per oral-problem status.
fig = px.histogram(
    df13.sort_index(ascending=False),
    x="age_years",
    color="sm_status",
    facet_col='oral_problems',
    nbins=12,
    title='Smoking and age effect on oral problems',
    width=900,
    height=400,
)
fig.update_layout(barmode='overlay')
fig.update_traces(opacity=0.5)
fig.show()

# Summary table: mean age and group size per oral-problem / smoking status.
df13.groupby(['oral_problems', 'sm_status']).agg(
    age_mean=('age_years', 'mean'),
    Oral_problem_count=('oral_problems', 'size'),
).round(1)
| age_mean | Oral_problem_count | ||
|---|---|---|---|
| oral_problems | sm_status | ||
| No | Doesn't Smoke | 59.5 | 1554 |
| Smokes | 48.1 | 883 | |
| Yes | Doesn't Smoke | 56.2 | 651 |
| Smokes | 46.2 | 801 |
The above histograms use 3 different features. We have two histograms, one displaying people with oral problems, and the other displaying people with no oral problems. Age in years is along the X axis, smoking status is the color, and along the Y axis we have the count. There is also a line graph displaying how smoking status changes as people get older.
Before analysis of oral problems frequency in smokers, first it should be noted that around the age of 40-50, the amount of smokers declines, which can be observed from the line graph.
A key observation is on the left histogram, which displays people with oral problems. It can be seen that from ages 20-50, smokers (red) have a much higher frequency of oral problems, and in the 60s-80s there are far fewer smokers, so there is a decline in the number of smokers with oral problems. The frequency of oral problems drops after a peak at 60 years, which may be linked to the decline in people who smoke, which can be observed in the line graph.
It can be concluded that smoking plays a role in people having oral problems. At around 60 years, we see the amount of oral problems spike, and incidentally that's the same age at which people start to stop smoking. Perhaps one of the reasons why people stop smoking is that they have increased amounts of oral problems (and likely other health issues from smoking) after many years of smoking.
This section looks at the amount of smokers on medication for high blood pressure (HBP) or high cholesterol (HC). The feature 'sm_amount_per_day' is used, which is the number of cigarettes a person smokes per day, and since we are using this feature we are only getting information on smokers.
Hypothesis
It is expected that we will see a higher frequency of smokers being on medication for both HBP and HC, since smoking causes increases in both conditions. It is also expected that those on medication will be of higher ages, since HBP and HC generally manifest more as people get older
df_smk = df.copy()

# Only respondents who reported cigarettes per day, with IQR outliers removed.
df_bmi2 = remove_outlier(
    df_smk[df_smk['sm_amount_per_day'].notna()],
    'sm_amount_per_day',
)

# Scatter of cigarettes per day against age, coloured by HC medication,
# with marginal histograms on both axes.
hchol_marker = dict(size=10, line=dict(width=1, color='DarkSlateGrey'))
fig = px.scatter(
    df_bmi2.sort_index(ascending=False),
    x="age_years",
    y="sm_amount_per_day",
    color="medication_hchol",
    symbol="medication_hchol",
    symbol_sequence=['star', 'cross'],
    marginal_x="histogram",
    marginal_y="histogram",
    title='Medication for high cholesterol in smokers',
    width=900,
    height=600,
)
# Marker styling applies to the scatter traces only; opacity applies to every trace.
fig.update_traces(marker=hchol_marker, selector=dict(mode='markers'))
fig.update_traces(opacity=0.7)
fig.show()

# Summary table per HC-medication status.
df_bmi2.groupby(['medication_hchol']).agg(
    smoke_mean=('sm_amount_per_day', 'mean'),
    Count=('medication_hchol', 'size'),
    age_mean=('age_years', 'mean'),
).round(1)
| smoke_mean | Count | age_mean | |
|---|---|---|---|
| medication_hchol | |||
| No | 11.3 | 132 | 54.1 |
| Yes | 11.3 | 264 | 61.0 |
The above graph uses three features, number of cigarettes per day for the Y axis, age for the X axis, and medication for HC as the color. There are also marginal histograms used on both the X and Y axis to display the differences in frequency of HC medication.
A general observation is that people smoke about 11 cigarettes per day on average. It can be seen that there are many people who smoke either 10 or 20 cigarettes per day, which shows up as horizontal lines of points at 10 and 20 on the Y axis; these are most likely half-a-pack-a-day and pack-a-day smokers. The number of cigarettes per day is similar between the two groups; it is the count of people in each group that is the key observation.
It can be observed that from age 50 onwards, the number of smokers on medication for HC is double the number of smokers not on HC medication in most cases.
By switching between displaying different colors, the increased number of people on medication is clearly seen.
# Same view as the cholesterol plot, coloured by HBP medication instead.
hbp_marker = dict(size=10, line=dict(width=1, color='DarkSlateGrey'))
fig = px.scatter(
    df_bmi2,
    x="age_years",
    y="sm_amount_per_day",
    color="medication_hbp",
    symbol='medication_hbp',
    symbol_sequence=['star', 'cross'],
    marginal_x="histogram",
    marginal_y="histogram",
    title='Medication for high blood pressure in smokers',
    width=900,
    height=600,
)
# Marker styling applies to the scatter traces only; opacity applies to every trace.
fig.update_traces(marker=hbp_marker, selector=dict(mode='markers'))
fig.update_traces(opacity=0.7)
fig.show()

# Summary table per HBP-medication status.
df_bmi2.groupby(['medication_hbp']).agg(
    Count=('medication_hbp', 'size'),
    smoke_mean=('sm_amount_per_day', 'mean'),
    age_mean=('age_years', 'mean'),
).round(1)
| Count | smoke_mean | age_mean | |
|---|---|---|---|
| medication_hbp | |||
| No | 136 | 10.4 | 48.3 |
| Yes | 391 | 10.9 | 59.2 |
Similar to the HC graph, it can be seen that there are many more people who smoke and are on medication for HBP.
Across all age groups after 35 years, we see more people being on medication for HBP than not.
It can be concluded that smoking likely causes adverse physiological conditions that result in both HBP and HC
This section takes a look at increased liver stiffness, which is associated with liver fibrosis, and how it increases with higher BMIs and ages, between genders, and depending on whether a person has diabetes or not
Hypothesis
It is suspected that those with higher BMI's will have increased levels of liver stiffness since a higher BMI generally causes inflammation of organs, which can cause increased liver stiffness over time. Additionally it is suspected that a those with higher ages will have increased liver stiffness, since liver conditions can take many years to manifest. Diabetes may increase liver stiffness as diabetes can effect many parts of the human body, including the liver.
# Respondents with a liver stiffness measurement, with BMI outliers removed.
has_stiffness = df['liver_stiffness'].notna()
df1 = remove_outlier(df[has_stiffness], 'bmi')

# BMI against age; point size is liver stiffness, colour is diabetes status.
fig = px.scatter(
    df1,
    x="bmi",
    y="age_years",
    color="diabetes",
    size='liver_stiffness',
    title='Liver Stiffness, Diabetes, and BMI',
    width=1200,
)
fig.show()

# Summary table per diabetes status.
df1.groupby(['diabetes']).agg(
    Liver_stiffness_mean=('liver_stiffness', 'mean'),
    BMI_mean=('bmi', 'mean'),
    Diabetes_age_mean=('age_years', 'mean'),
).round(1)
| Liver_stiffness_mean | BMI_mean | Diabetes_age_mean | |
|---|---|---|---|
| diabetes | |||
| No | 5.4 | 27.9 | 41.3 |
| Yes | 7.5 | 31.4 | 62.4 |
The above scatter plot uses 4 different features. We see a person's age on the Y axis, BMI on the X axis, liver stiffness as the size of each point, and diabetes as the color.
The most obvious takeaway is the occurrence of diabetes in older people, which can be seen from the large cluster of red points. The bulk of those with diabetes are roughly 60 years old, and we find very few occurrences of people with diabetes below 50 years old, with a noticeably sharp increase above this age. It can be inferred that diabetes is a disease which is more prevalent in older people, likely because the disease itself takes many years to manifest. Perhaps after a person has lived for 50 years, their pancreas begins to function less efficiently, and develops adverse conditions which affect its ability to control blood sugar levels.
In addition to age, we can see that a person's BMI also has an effect on diabetes. A healthy BMI is around 18.5 - 24.9, and interestingly we see the occurrence of diabetes increase between 20 and 30 along the X axis. This indicates that a higher BMI is also a significant factor in having diabetes. It can be inferred that those with higher BMIs likely live an unhealthier lifestyle, and now that they have reached an older age, their pancreas has had an increased 'workload' throughout their lives, and has now begun to perform inefficiently, with functional problems.
Liver stiffness is generally low for ages 10-45. Where BMI is greater than 25 and age is above 40-50, there is a large area where liver stiffness is increased. By observing the top right area of the graph (higher BMI and ages), much larger points (which indicate higher liver stiffness) can clearly be seen, which points to a relationship between the two features. It can be inferred that higher BMIs and ages are also contributing factors. Similar to the pancreas, the liver likely begins to develop adverse functional problems, and conditions arise that cause stiffer livers. Notably, older people with healthier BMIs seem to have less stiff livers, and as BMI increases to levels above 25, we see increased liver stiffness. It is also worth observing that most of the outliers for BMI have stiffer livers, even at lower ages.
The table below the graph helps conclude a few points. Those with diabetes generally have increased liver stiffness, and higher BMI's, and is also likely to be older.
# Same scatter as before, split into one facet per gender.
fig = px.scatter(
    df1,
    x="bmi",
    y="age_years",
    color="diabetes",
    size='liver_stiffness',
    facet_col='gender',
    title='Liver stiffness and diabetes in Males and Females',
)
fig.show()

# Summary table per gender and diabetes status.
df1.groupby(['gender', 'diabetes']).agg(
    Liver_stiffness_mean=('liver_stiffness', 'mean'),
    BMI_mean=('bmi', 'mean'),
    Diabetes_count=('diabetes', 'size'),
    Diabetes_age_mean=('age_years', 'mean'),
).round(1)
| Liver_stiffness_mean | BMI_mean | Diabetes_count | Diabetes_age_mean | ||
|---|---|---|---|---|---|
| gender | diabetes | ||||
| Female | No | 5.1 | 28.2 | 4022 | 42.0 |
| Yes | 6.7 | 32.1 | 490 | 60.9 | |
| Male | No | 5.8 | 27.5 | 3971 | 40.6 |
| Yes | 8.0 | 30.9 | 621 | 63.6 |
Additionally we can have a look how gender influences liver stiffness, age, bmi, and diabetes.
Notable differences are that males appear to have higher occurrences of diabetes, and higher liver stiffness overall. By clicking the graph so we only see those with diabetes, we see that males have significantly more points, and the table below shows over 110 more males having diabetes. Additionally we see from the size of each point that males have increased liver stiffness compared to females. The table shows that males have a greater mean liver stiffness, regardless of having diabetes.
Interestingly males have lower BMI's on average, although have increased liver stiffness and occurrences of diabetes. Its possible to induce that diabetes plays a greater part in liver stiffness compared to BMI, and that is why men have higher liver stiffness, although there are likely other factors in play.
This section looks into how the amount of moderate or vigorous activity in a work or recreational setting changes when people smoke and don't smoke, and if their age has additional influence or if BMI is affected. Four graphs will be generated that will look at different types of activity: vigorous recreational activity, moderate recreational activity, vigorous work activity, and moderate work activity
Daily smokers and non smokers are only being considered, not people who only smoke some days
Hypothesis
It is hypothesised that people who smoke will not perform as much vigorous activity at older ages. Since a lifetime of smoking affects a person's lungs and general health, it's reasonable to expect that smokers will not be performing as much vigorous recreational activity. Additionally, it is also reasonable to assume that older people who are performing more vigorous activity are likely to care more about their health.
Additionally it was hypothesised that smokers will have more minutes of physical activity and higher BMI's, although the results showed otherwise.
# Daily smokers and never-smokers with a known BMI ("Some Days" smokers are excluded).
df7 = df.dropna(subset=['sm_freq', 'bmi'])
df7 = df7[df7['sm_freq'] != 'Some Days'].copy()
df7['sm_status'] = np.where(df7['sm_freq'] != "Never", "Smokes", "Doesn't Smoke")

# Independent copies for the three other activity types analysed further down.
df8 = df7.copy()
df9 = df7.copy()
df10 = df7.copy()

# Vigorous recreational activity: keep rows with recorded minutes.
df7 = df7.dropna(subset=['mins_vig_act'])
vig_rec_options = dict(
    x='bmi',
    y='age_years',
    facet_col='sm_status',
    color_continuous_scale='burg',
    width=1000,
    height=500,
    opacity=0.9,
)
fig = px.scatter(
    df7,
    color='mins_vig_act',
    size='mins_vig_act',
    title='Vigorous recreational activity of smokers VS non smokers',
    **vig_rec_options,
)
fig.show()

# Mean minutes per smoking status and age group, next to the overall means.
g = df7.groupby(['sm_status', 'age']).agg(
    Mins_Vig_Activity=('mins_vig_act', 'mean'),
    Count=('age', 'size'),
).round(1)
g2 = df7.groupby(['sm_status']).agg(
    Average_Vig_Activity=('mins_vig_act', 'mean'),
    Average_Age_years=('age_years', 'mean'),
    Count=('age', 'size'),
).round(1)
side_by_side(g, g2)
| Mins_Vig_Activity | Count | ||
|---|---|---|---|
| sm_status | age | ||
| Doesn't Smoke | 20's | 67.2 | 59 |
| 30's | 78.2 | 84 | |
| 40's | 61.9 | 68 | |
| 50's | 68.5 | 62 | |
| 60's | 73.9 | 76 | |
| 70's | 73.7 | 47 | |
| Smokes | 20's | 97.3 | 50 |
| 30's | 81.1 | 50 | |
| 40's | 84.2 | 30 | |
| 50's | 76.6 | 16 | |
| 60's | 66.9 | 8 | |
| 70's | 37.5 | 2 |
| Average_Vig_Activity | Average_Age_years | Count | |
|---|---|---|---|
| sm_status | |||
| Doesn't Smoke | 71.5 | 49.1 | 401 |
| Smokes | 84.9 | 37.4 | 162 |
The above two scatter plots use 4 features, age along the Y axis, BMI along the X axis, for size of each point there is minutes of vigorous activity, and for a facet column there is smoking status.
An interesting observation that can be made is the amount of vigorous activity seen in younger smokers. The right graph has a dense cluster of points in the bottom left, where most points have a larger size, which tells us that younger people who smoke have relatively lower BMIs and perform more minutes of vigorous activity. It's not simple to infer a reason why this is the case; perhaps, since people tend to smoke in their younger years (which has already been observed in the oral smoking analysis), they have more capable bodies/lungs and are still able to perform more vigorous activity.
The table displays some interesting results. For smokers we see that those in their 20's perform the most vigorous activity on average, which we can also easily see in the graph. In age groups from the 20's to the 40's, smokers have more minutes of vigorous activity, with smokers having over 30 minutes more vigorous activity in their 20's, and over 20 minutes more in their 40's, than non smokers.
# Moderate recreational activity: keep rows with recorded minutes.
df8 = df8.dropna(subset=['mins_mod_act'])
mod_rec_options = dict(
    x='bmi',
    y='age_years',
    facet_col='sm_status',
    color_continuous_scale='burg',
    width=1000,
    height=500,
    opacity=0.9,
)
fig = px.scatter(
    df8.sort_index(ascending=False),
    color='mins_mod_act',
    size='mins_mod_act',
    title='Moderate recreational activity of smokers VS non smokers ',
    **mod_rec_options,
)
fig.show()

# Mean minutes per smoking status and age group, next to the overall means.
g = df8.groupby(['sm_status', 'age']).agg(
    Mins_Mod_Activity=('mins_mod_act', 'mean'),
    Count=('age', 'size'),
).round(1)
g2 = df8.groupby(['sm_status']).agg(
    Average_Mod_Activity=('mins_mod_act', 'mean'),
    Average_Age_years=('age_years', 'mean'),
    Count=('age', 'size'),
).round(1)
side_by_side(g, g2)
| Mins_Mod_Activity | Count | ||
|---|---|---|---|
| sm_status | age | ||
| Doesn't Smoke | 20's | 69.7 | 76 |
| 30's | 67.8 | 104 | |
| 40's | 57.6 | 118 | |
| 50's | 72.6 | 131 | |
| 60's | 66.6 | 186 | |
| 70's | 77.7 | 170 | |
| Smokes | 20's | 78.2 | 68 |
| 30's | 93.2 | 79 | |
| 40's | 83.6 | 66 | |
| 50's | 76.2 | 72 | |
| 60's | 103.9 | 44 | |
| 70's | 71.5 | 20 |
| Average_Mod_Activity | Average_Age_years | Count | |
|---|---|---|---|
| sm_status | |||
| Doesn't Smoke | 69.5 | 55.3 | 791 |
| Smokes | 84.6 | 45.2 | 354 |
The graphs above look at moderate activity of smokers and non smokers.
Again there is a similar theme where smokers tend to perform more activity than non smokers. With moderate activity, however, we see more smokers performing moderate recreational activity around age 60 than we saw for vigorous recreational activity. We also do not have a cluster of young smokers who complete significant amounts of activity in their 20-30s
# Moderate work activity: keep rows with recorded minutes.
df9 = df9.dropna(subset=['mins_mod_work'])
mod_work_options = dict(
    x='bmi',
    y='age_years',
    facet_col='sm_status',
    color_continuous_scale='burg',
    width=1000,
    height=500,
    opacity=0.9,
)
fig = px.scatter(
    df9.sort_index(ascending=False),
    color='mins_mod_work',
    size='mins_mod_work',
    title='Moderate work activity of smokers VS non smokers',
    **mod_work_options,
)
fig.show()

# Mean minutes per smoking status and age group, next to the overall means.
g = df9.groupby(['sm_status', 'age']).agg(
    Mins_Mod_Work=('mins_mod_work', 'mean'),
    Count=('age', 'size'),
).round(1)
g2 = df9.groupby(['sm_status']).agg(
    Average_Mod_Work=('mins_mod_work', 'mean'),
    Average_Age_years=('age_years', 'mean'),
    Count=('age', 'size'),
).round(1)
side_by_side(g, g2)
| Mins_Mod_Work | Count | ||
|---|---|---|---|
| sm_status | age | ||
| Doesn't Smoke | 20's | 218.8 | 96 |
| 30's | 175.1 | 115 | |
| 40's | 166.5 | 129 | |
| 50's | 192.2 | 142 | |
| 60's | 135.6 | 230 | |
| 70's | 134.8 | 189 | |
| Smokes | 20's | 254.1 | 123 |
| 30's | 222.2 | 137 | |
| 40's | 215.2 | 116 | |
| 50's | 184.3 | 124 | |
| 60's | 156.3 | 80 | |
| 70's | 141.3 | 31 |
| Average_Mod_Work | Average_Age_years | Count | |
|---|---|---|---|
| sm_status | |||
| Doesn't Smoke | 163.1 | 55.1 | 907 |
| Smokes | 206.3 | 44.9 | 618 |
Here we are looking at work rather than recreational activity
It can be seen that again we find that smokers perform greater number of minutes of activity than non smokers. The right graph clearly has larger and darker points, indicating a greater number of minutes.
The same theme of less number of smokers at higher ages is seen.
# Vigorous work activity: keep rows with recorded minutes.
df10 = df10[df10['mins_vig_work'].notna()]

# BMI against age per smoking status; colour and size both encode minutes of
# vigorous work activity.
fig = px.scatter(df10.sort_index(ascending=False),
                 y='age_years',
                 x='bmi',
                 color='mins_vig_work',
                 color_continuous_scale='burg',
                 facet_col='sm_status',
                 title='Vigorous work activity of smokers VS non smokers ',
                 size='mins_vig_work',
                 width=1000,
                 height=500,
                 opacity=0.9,
                 )
fig.show()

# The per-age column is named after the variable it summarises
# (mins_vig_work), matching the naming used by the other activity tables.
g = df10.groupby(['sm_status', 'age']).agg(Mins_Vig_Work=('mins_vig_work', 'mean'),
                                           Count=('age', 'size')).round(1)
g2 = df10.groupby(['sm_status']).agg(Average_Vig_Work=('mins_vig_work', 'mean'),
                                     Average_Age_years=('age_years', 'mean'),
                                     Count=('age', 'size')).round(1)
side_by_side(g, g2)
| Mins_Mod_Activity | Count | ||
|---|---|---|---|
| sm_status | age | ||
| Doesn't Smoke | 20's | 235.9 | 56 |
| 30's | 203.9 | 78 | |
| 40's | 165.2 | 81 | |
| 50's | 189.0 | 89 | |
| 60's | 162.2 | 119 | |
| 70's | 158.1 | 65 | |
| Smokes | 20's | 256.3 | 86 |
| 30's | 243.9 | 109 | |
| 40's | 266.1 | 92 | |
| 50's | 195.8 | 75 | |
| 60's | 151.2 | 46 | |
| 70's | 258.5 | 13 |
| Average_Vig_Work | Average_Age_years | Count | |
|---|---|---|---|
| sm_status | |||
| Doesn't Smoke | 181.9 | 52.1 | 492 |
| Smokes | 232.7 | 43.2 | 426 |
As with all other types of activity, those that smoke complete more minutes of vigorous work related activity.
However, with this type of activity it can be observed that smokers in all age groups on average have a higher number of minutes of vigorous work activity, apart from people in their 60's, which can be seen in the left table.
Overall it is somewhat surprising that smokers on average perform significantly more minutes of all kinds of activity. It may be a reasonable to assume that smokers would do less activity, since physical activity is generally seen as a healthy activity, and smoking is a very unhealthy activity. Although, this information produced indicates otherwise.
Some conclusions can be drawn about why smokers complete more activity:
This section investigates how a persons income changes when they have different levels of education, and if the have certain 'poor' lifestyle habits, like smoking and excessive drinking. Income is based on the poverty level a person has, with 5 being the least affected by poverty since they have higher incomes, and 0 being the most affected because they have lower incomes.
Hypothesis
It is likely that those who have poor lifestyles will not have as high an income. This is a reasonable hypothesis, as it is expensive to drink and smoke, and it is difficult to hold down a high paying job whilst drinking excessively. Since smokers are essentially addicted to cigarettes, and addiction can be associated with less discipline and poorer decision making skills, an argument can be made that smokers are likely to have traits that make it harder for them to hold down a higher paying job. This argument poses ethical concerns, since a lot of assumptions are being made about a person just because they smoke. Those that don't have poor lifestyle habits are less likely to be distracted by drinking or smoking, and are able to focus on their careers/businesses to help them generate more income.
It is also expected that those with college education or higher will have greater income than those with education from high school down to lower than year 9. Those with higher education have more access to high paying jobs because of their degrees.
# Respondents with both an education level and a smoking frequency recorded.
# ``.copy()`` gives an independent frame so the new columns below are not
# written onto a slice of ``df``.
df_edu1 = df.loc[df['education_level'].notna() & df['sm_freq'].notna()].copy()

# Anyone whose frequency is not "Never" counts as a smoker.
df_edu1['sm_status'] = np.where(df_edu1['sm_freq'] != "Never", "Smokes", "Doesn't Smoke")

# Heavy drinker: drinks at least 3-4 times a week, or averages 3+ drinks per
# drinking day. Missing answers compare False and therefore give "No".
drinks_often = df_edu1['1yr_freq'].isin(['Everyday', 'Most Days', '3-4 Times a week'])
drinks_a_lot = df_edu1['1yr_avg_per_day'] >= 3
df_edu1['heavy_drinker'] = np.where(drinks_often | drinks_a_lot, "Yes", "No")

# Poor lifestyle = smoker and/or heavy drinker. ``sm_status`` holds
# "Smokes" / "Doesn't Smoke", so it must be compared against "Smokes";
# comparing against 'Yes' would never match and smoking would be ignored.
poor_habits = (df_edu1['sm_status'] == 'Smokes') | (df_edu1['heavy_drinker'] == "Yes")
df_edu1['life_style_habits'] = np.where(poor_habits, "poor", "good")

# College education flag.
college_levels = ['College Level', 'Above College']
df_edu1['College_Edu'] = np.where(df_edu1['education_level'].isin(college_levels), "Yes", "No")

# Drops BMI outliers (and rows without a BMI) before plotting.
df_edu1 = remove_outlier(df_edu1, 'bmi')

# Income against age, faceted by college education (columns) and lifestyle (rows).
fig = px.density_heatmap(df_edu1,
                         x="family_income",
                         y='age_years',
                         facet_col="College_Edu",
                         facet_row="life_style_habits",
                         height=800,
                         width=800,
                         title="Income, Age, Lifestyle type, and Education level",
                         )
fig.show()
The above density heatmap uses 4 different features. Family income along the X axis, Age along the Y, with education as a column facet, and poor lifestyle as a row facet. The furthest right section of each heatmap is where a person has an greatest income, and the furthest left is the worst income.
It can be seen in the above graphs that those without poorer lifestyles and higher education clearly have great frequency of higher incomes, and those with poor lifestyles and no education, have a high frequency of lower income.
Firstly it can be seen that age is a significant factor for a person's income; most people who have higher incomes are above 40
The top right shows how a person who has college education, and does not have poor lifestyle choices, has a higher income. The yellow/orange indicates a high number of people who have higher income. Additionally we see less people with an income level around 2 compared to non college educated people. As hypothesised, this is likely due to the fact that higher education allows for people to get higher paying jobs, and since they don't have the distractions and negative traits associated with drinking and smoking.
The bottom right shows that there is less amounts of people with college education who have poorer lifestyles, although we don't see higher number with income level of 5 than any other graph apart from the top right. Heavy Drinkers and smokers with college education still have access to higher paying jobs, although it appears that poor habits make it less likely to have higher income.
It can be seen that people with less education mostly have lower income, which is mainly seen in the top left graph, with most cells with income level around 1-2 having the highest count.